Data Import and Processing
These analyses pull data from production and development DHIS2 environments to assess client movement.
Data look like this:
#read file from production and view
raw4<-suppressMessages(read_csv("BrianDs.csv"))
#View(raw4)
#colnames(raw4)
head(raw4)
## # A tibble: 6 x 8
## EnrOrgUnit ENrUserType TeiUid StageUid VisitNo EvtOrgUnit EvtUserType GestAge
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 uyaWnaBmAs1 HA rbodU~ Ty22Qt2~ 1 uyaWnaBmA~ HA 25.3
## 2 uyaWnaBmAs1 HA kkpNN~ Ty22Qt2~ 1 uyaWnaBmA~ HA 13.4
## 3 uyaWnaBmAs1 HA v14CD~ Ty22Qt2~ 1 uyaWnaBmA~ HA 19.4
## 4 uyaWnaBmAs1 HA vd7bT~ Ty22Qt2~ 1 uyaWnaBmA~ HA 25.1
## 5 uyaWnaBmAs1 HA trPZE~ Ty22Qt2~ 1 uyaWnaBmA~ HA 25.3
## 6 gtUM8Eraqvu HA TdT1e~ Ty22Qt2~ 1 gtUM8Eraq~ HA 21.4
Data from development environment
#now get background data from dev
#get Org Unit groups
baseurl<-"https://bd-eregistry.dhis2.org/dhis/"
username<-"ing_test"
#function for logging in
loginDHIS2<-function(baseurl,username,password) {
url<-paste0(baseurl,"api/me")
r<-GET(url,authenticate(username,password))
warn_for_status(r, task="log in")
if(r$status_code == 200L){return(TRUE)}
}
if(loginDHIS2(baseurl, username, password)==TRUE){
print("successfully logged in")
}else{
stop("could not log in! Please check url, username and password")
}
## [1] "successfully logged in"
#groups
url<-paste0(baseurl, "api/organisationUnitGroups.json?paging=false&fields=id,name,organisationUnits")
ou_groups<-fromJSON(content(GET(url), "text"), flatten = TRUE) %>%
data.frame() %>%
select("name"=1,"id"=2,"members"=3) %>%
unnest_longer(members) %>%
flatten()
head(ou_groups)
## name id members.id
## 1 10/20/31/50 Bed Hospital (not UHC) iXQnUg4Ayjg <NA>
## 2 Baganbari Union, Uttar Mat qzxlprBwoN2 TNlLoHITukJ
## 3 Baganbari Union, Uttar Mat qzxlprBwoN2 qDuy7VCWXEu
## 4 Baganbari Union, Uttar Mat qzxlprBwoN2 Law6euHPccf
## 5 Baganbari Union, Uttar Mat qzxlprBwoN2 kKuwT6hXLqh
## 6 Baganbari Union, Uttar Mat qzxlprBwoN2 oKhlabKlBHa
#stages
url<-paste0(baseurl, "api/programStages.json?paging=false&fields=id,name")
ps<-fromJSON(content(GET(url), "text"), flatten = TRUE) %>%
data.frame() %>%
select("id"=2, "psname"=1)
head(ps)
## id psname
## 1 piRv8jtcLQV ANC 1st visit_
## 2 WZbXY0S00lP ANC 1st visit
## 3 h6idWg9SfPr ANC Follow up sheet
## 4 ViHTJrKKrFg ANC Green File
## 5 tlzRiafqzgd ANCManagements
## 6 iXDSolqmauJ ANCRisks
#OU names
url<-paste0(baseurl, "api/organisationUnits.json?paging=false&fields=id,name")
ou_names<-fromJSON(content(GET(url), "text"), flatten = TRUE) %>%
data.frame() %>%
select("name"=1, "ou_id"=2) %>%
mutate("ou_type"=case_when(
str_detect(name, " FWC") ~ "FWC",
str_detect(name, "(?i)CC") ~ "CC",
str_detect(name, "Unit") ~ "Unit",
str_detect(name, "Ward") ~ "Ward"))
head(ou_names)
## name ou_id ou_type
## 1 Aburkandi FWC mf3RMT3QRhf FWC
## 2 Aithadi Pacani CC, Matlab(North) si7FoHyJpZ0 CC
## 3 Amiapur Cc - Matlab(north) P1okfUePFEd CC
## 4 Aswinpur FWC ubVXGg2leZa FWC
## 5 BAGANBARI UNION, UTTAR MATLAB mR8aUxvBDgQ <NA>
## 6 BAHERCHOR LOTURDI CC PduJMfKZhFt CC
Data Processing
We then merge the raw outputs of TEI events with org unit information
If arrange events by visit number, classify subsequent visits as being at same or different org unit than the initial visit.
my_data<-raw4 %>%
left_join(ou_names, by=c("EvtOrgUnit"="ou_id")) %>%
select(tei=3, ga=8, "ou_id"=EvtOrgUnit, ou_type, name, VisitNo, EnrOrgUnit, StageUid) %>%
distinct() %>% #event must be unique visit, i.e. an ANC management and ANC stage same week would be merged
mutate(ga=round(as.numeric(ga))) %>%
filter(!is.na(ou_type) & !is.na(ga) & ga <= 50 & ga >= 1) %>%
mutate(VisitNo=if_else(VisitNo > 6, "7+", as.character(VisitNo))) %>%
mutate(VisitNo=as.factor(VisitNo)) %>%
mutate(ou_type=recode_factor(ou_type, "FWC"="FWC", "CC"="CC", "Ward"="Ward", "Unit"="Unit" )) %>%
group_by(tei) %>%
add_tally() %>%
arrange(tei, ga) %>%
mutate(first = dplyr::first(ou_id)) %>%
mutate(ga_initial = dplyr::first(ga)) %>%
mutate(last_ou = lag(ou_id)) %>%
mutate(Moved_ou = case_when(first == ou_id & is.na(last_ou) ~ "Event 1",
first == ou_id & !is.na(last_ou) ~ "Event 2+, same ou as Event 1",
first != ou_id ~ "Event 2+, different ou as Event 1")) %>%
mutate(Moved_ou_wrap = str_wrap(Moved_ou, width = 20))
#my_data
head(my_data)
## # A tibble: 6 x 14
## # Groups: tei [4]
## tei ga ou_id ou_type name VisitNo EnrOrgUnit StageUid n first
## <chr> <dbl> <chr> <fct> <chr> <fct> <chr> <chr> <int> <chr>
## 1 A0du~ 21 HWOI~ Ward Ward~ 1 HWOIuGUJV~ Ty22Qt2~ 1 HWOI~
## 2 a0lM~ 40 P1ok~ CC Amia~ 1 P1okfUePF~ Ty22Qt2~ 2 P1ok~
## 3 a0lM~ 40 P1ok~ CC Amia~ 1 P1okfUePF~ WZbXY0S~ 2 P1ok~
## 4 a0Mz~ 25 ZiEo~ Unit Unit~ 1 ZiEozn9m9~ Ty22Qt2~ 1 ZiEo~
## 5 a0RW~ 18 uMnk~ Ward Ward~ 1 uMnkQBPF2~ Ty22Qt2~ 2 uMnk~
## 6 a0RW~ 33 Sr8A~ FWC Shak~ 1 uMnkQBPF2~ WZbXY0S~ 2 uMnk~
## # ... with 4 more variables: ga_initial <dbl>, last_ou <chr>, Moved_ou <chr>,
## # Moved_ou_wrap <chr>
Visualization
Let’s start simple with a histogram. GA of each visit, by OU type.
p<-ggplot(my_data, aes(ga))+
geom_histogram(bins=25)+
facet_wrap(~ou_type, ncol=1)+
labs(title="Events by GA and OU type")
p

Density Plot by Org Unit type and Gestational Age
Another way to show this histogram is a density dot plot. To make it interesting we can animate it, to emphasize this progression of time. It looks a bit like a paint roller…
p<-ggplot(my_data, aes(ga, ou_type)) +
geom_jitter(aes(group = ga, size = .3), height = 0.25, show.legend = FALSE) +
labs(title="Pregnancy events in e-Reg Matlab by Org Unit",
subtitle = 'Visits at Gestational Age {closest_state}',
y = 'Org Unit Type') +
# scale_colour_manual(values = col_scale) +
transition_states(ga, transition_length = 3, state_length = 2) +
shadow_mark(size = .5) +
ease_aes('linear')
animate(
plot = p,
nframes = 200,
duration = 15,
end_pause = 50
)

But this doesnt say much about patient movement. When do clients move to a different org unit clinic?
The red dots are the GA and location (org unit) of first event (identification).
The blue dots are subsequent events that are at same location as first event.
The green dots are movement to a DIFFERENT location than the identification org unit.
p<-ggplot(my_data, aes(ga, Moved_ou_wrap)) +
geom_jitter(aes(group = ga, color = Moved_ou_wrap), size = 0.01) +
facet_wrap(~ou_type, ncol = 1)+
theme(legend.position = "none") +
labs(title="Pregnancy events in e-Reg Matlab by Org Unit") +
ylab("")
p

Same thing, but animated…
p<-p +
labs(title="Pregnancy events in e-Reg Matlab by Org Unit",
subtitle = 'Visits at Gestational Age {closest_state}') +
theme(legend.position = "none") +
transition_states(ga, transition_length = 3, state_length = 2) +
shadow_mark(size = .3) +
ease_aes('linear')
animate(
plot = p,
nframes = 200,
duration = 15,
end_pause = 50
)

Now we can see that most movement to new place happens after 36 weeks for FWA units and HA Wards (Home PPC follow up for those not identified by FWA). But women move to FWC between 14 and 35 weeks. Fewer women choose to move to CC, if they were identified elsewhere.
We want to narrow in on the patients who start at one org unit, and receive services at another. This approach tells us what kind of org unit they GO to, but not what kind of org unit they COME from.
Heat Map
If we narrow in on the patients who move to a new location (the green dots above), we can see the overlap of service provision between types of org unit.
Here is a table of events that are at a different org unit than the enrollment org unit, arranged by org unit type.
pcords_data <-my_data %>%
ungroup() %>%
filter(Moved_ou=="Event 2+, different ou as Event 1") %>%
rename("event_ou_id"=ou_id, "event_ou_type"=ou_type) %>%
left_join(ou_names, by=c("EnrOrgUnit"="ou_id")) %>%
rename("enr_ou_type"=ou_type) %>%
select(enr_ou_type, event_ou_type, "event_ga"=ga, "enr_ga"=ga_initial)
test<-pcords_data %>%
arrange(event_ou_type) %>%
mutate(event_ou_type=factor(event_ou_type, levels=c("CC","FWC","Unit","Ward"))) %>%
group_by(enr_ou_type, event_ou_type) %>%
select("enrollment OU"=enr_ou_type, "event OU"=event_ou_type) %>%
summarize(count=n())
kable(test) %>%
kable_styling()
|
enrollment OU
|
event OU
|
count
|
|
CC
|
CC
|
4
|
|
CC
|
FWC
|
29
|
|
CC
|
Unit
|
9
|
|
FWC
|
CC
|
11
|
|
FWC
|
FWC
|
8
|
|
FWC
|
Unit
|
10
|
|
Unit
|
CC
|
100
|
|
Unit
|
FWC
|
298
|
|
Unit
|
Unit
|
27
|
|
Unit
|
Ward
|
16
|
|
Ward
|
CC
|
14
|
|
Ward
|
FWC
|
73
|
|
Ward
|
Unit
|
4
|
We can visualize this graph in a heatmap
# Give extreme colors:
library(viridis)
ggplot(test, aes(`enrollment OU`, `event OU`, fill= count)) +
geom_tile() +
scale_fill_viridis(discrete=FALSE) +
theme_minimal()

We can facet these down by enrollment OU, then show the event GA for each subsequent event.
heat2<-pcords_data %>%
mutate(event_ou_type=factor(event_ou_type, levels=c("CC","FWC","Unit","Ward"))) %>%
mutate(gestage_event=case_when(
event_ga >= 0 & event_ga < 18 ~ "0-17",
event_ga >= 18 & event_ga < 24 ~ "18-23",
event_ga >= 24 & event_ga < 29 ~ "24-29",
event_ga >= 29 & event_ga < 34 ~ "29-33",
event_ga >= 34 & event_ga < 40 ~ "34-39",
event_ga >= 40 ~ "40+",
)) %>%
group_by(enr_ou_type, event_ou_type, gestage_event) %>%
select("enrollment OU"=enr_ou_type, "event OU"=event_ou_type, gestage_event) %>%
summarize(count=n())
ggplot(heat2, aes(gestage_event, `event OU`, fill= count)) +
geom_tile() +
scale_fill_viridis(discrete=FALSE) +
facet_wrap(~`enrollment OU`,labeller = "label_both")+
labs(title="Events at different OU than enrollment",
subtitle="By Event Gest Age and OU Type")

Animated Dot Plot
What are the patterns over time though?
In this animation, we focus on the “green dots” above. Each green dot is an event. The horizontal lines represent a type of org unit. The middle grey dots represent the identification events–once they cross that dot, the pregnancy is identified.
After that, clients go to many other types of org units. Some go to a different org unit of same type, while others go to a new org unit type.
By animating this over gestational age at visit, we can see which weeks had highest “crossover” of events. The speed of dot movement represents the time between visits.
pcord2<-pcords_data %>%
rownames_to_column(var="id") %>%
mutate("enr_start"=enr_ou_type) %>%
pivot_longer(c('enr_start', 'enr_ou_type','event_ou_type'),
names_to = "start_finish",
values_to="ou_type") %>%
mutate(gestage=if_else(start_finish=="enr_ou_type", enr_ga,
if_else(start_finish=="event_ou_type", event_ga, 0))) %>%
mutate(endpoint=if_else(start_finish=="enr_start", 0,
if_else(start_finish=="enr_ou_type", 1, 2))) %>%
mutate(ou_type=factor(ou_type, levels = c("Unit","Ward","FWC","CC"))) %>%
mutate(gestage=if_else(endpoint==2 & enr_ga > 30 & gestage > 30, gestage + 2, gestage)) %>%
mutate(gestage=if_else(endpoint==2, gestage + 1, gestage))
pcord_summ<-pcord2 %>%
filter(endpoint!=0) %>%
group_by(endpoint, ou_type, gestage) %>%
summarise("count"=n_distinct(id)) %>%
mutate("cumsum"=cumsum(count))
#pcord2 %>% filter(endpoint==2 & gestage > 35)
ps1<-pcord_summ %>% filter(endpoint==1)
ps2<-pcord_summ %>% filter(endpoint==2)
p2<-ggplot() +
geom_point(data = pcord2, aes(x=endpoint, y = ou_type,
group = id), col = "green") +
geom_point(data = ps1, aes(x=endpoint, y = ou_type, size = cumsum),
col="grey", alpha=0.8) +
geom_point(data = ps2, aes(x=endpoint, y = ou_type, size = cumsum),
col="grey", alpha=0.8) +
theme_minimal() +
transition_reveal(gestage) +
scale_x_continuous(breaks=c(0, 1, 2),
labels=c("GA 0","Identification", "Other Event")) +
labs(title="e-Reg Matlab -- Events at Different Org Unit than Identification",
subtitle = 'Events at Gestational Age {round(frame_along)}',
x = "")
animate(p2,
nframes = 200,
duration = 15,
end_pause = 50)

If a dot moves quickly between identification and subsequent visit, that means that the next visit happened quickly after identification (e.g., Unit identifies pregnancy at 16 weeks, visit to CHCP at 18 weeks). Inversely, the second visit may occur long after identification (same client visits CHCP again at 32 weeks, this would be a slower moving dot).
Eventually want to recreate as a Sankey flow diagram across 4 ANC visits. See below.
Parallel Coordinates
If we want to explore these relationships further, we can use interactive visualization.
The type below is called parallel coordinates.
The vertical axes represent variables. The horizontal and diagonal lines are observations, where each is an EVENT that took at a different place than the enrollment org unit. The colors are based on the type of enrollment org unit.
You can click and drag across an axis to select a range for each variable, and it will filter down to the observations that meet that criteria. For example, of those clients who were enrolled at a WARD, and later went to a CC, what was the range of gestational ages when that CC visit took place?
##### Parallel Coordinates Graph ######
library(parcoords)
#parallel coordinates with color based on gender
parcoords::parcoords(data = pcords_data,
rownames = TRUE,
color = list(
# discrete or categorical column
colorScale = "scaleOrdinal",
colorBy = "enr_ou_type",
colorScheme = "schemeCategory10"),
withD3 = TRUE,
brushMode = "1D-axes-multi",
alphaOnBrushed = 0.2,
queue = TRUE,
rate = 50,
reorderable = TRUE)
To make the correlations easier to read, you can drag and rearrange the axes order.
We might expand on this by linking to the selected ranges to tables, for dynamic filtering of data.
Dropouts: Isolating Clients with only one event
If the client has only one event in system, maybe they are different for some reason than the other events.
For example, what stage was their only event?
What kind of org unit?
The below tables are only one event.
First is by org unit – most of these are the Pregnancy ID stage.
library(kableExtra)
###Isolate those that only havd one event
my_data_iso<-raw4 %>%
left_join(ou_names, by=c("EvtOrgUnit"="ou_id")) %>%
select(tei=3, ga=8, "ou_id"=EvtOrgUnit, ou_type, name, VisitNo, EnrOrgUnit, StageUid) %>%
left_join(ps, by = c("StageUid"="id")) %>%
mutate(ga=round(as.numeric(ga))) %>%
filter(!is.na(ou_type) & !is.na(ga) &
ga <= 50 & ga >= 1 &
str_detect(psname, paste(c("regnanc", "ANC", "Newborn","PNC","Lab"),collapse = '|')) &
!str_detect(psname, paste(c("Prev","Risk","Manag"),collapse = '|'))) %>%
group_by(tei) %>%
add_tally() %>%
arrange(tei, ga) %>%
mutate(first = dplyr::first(ou_id)) %>%
mutate(last_ou = lag(ou_id)) %>%
ungroup() %>%
mutate(Moved_ou = case_when(first == ou_id & is.na(last_ou) ~ "Event 1",
first == ou_id & !is.na(last_ou) ~ "Event 2+, same ou as Event 1",
first != ou_id ~ "Event 2+, different ou as Event 1"))
# my_data_iso %>%
# group_by(ou_type, n) %>%
# summarise("events"= n()) %>%
# mutate(percent = round(events/sum(events), 2))
#my_data_iso
test2<-my_data_iso %>%
group_by(tei) %>%
filter(n==1) %>%
group_by(psname) %>%
summarize("stage_count"=n())
test3<-my_data_iso %>%
filter(n==1) %>%
group_by(ou_type) %>%
summarize("ou_type_count"=n())
#kableExtra::kable(test2)
kable(test2) %>%
kable_styling()
|
psname
|
stage_count
|
|
ANC 1st visit_
|
2
|
|
গর্ভকালীন সেবার তথ্য (Home - ANC record)
|
4
|
|
গর্ভাবস্থার সনাক্তকরণ (Pregnancy identification)
|
2775
|
|
ANC 1st visit
|
1
|
|
ANC visit
|
6
|
|
ANC visit_
|
2
|
Next is by org unit. Most of these are the FWA Units.
kable(test3) %>%
kable_styling()
|
ou_type
|
ou_type_count
|
|
CC
|
72
|
|
FWC
|
46
|
|
Unit
|
1887
|
|
Ward
|
785
|
Cascade by Visit Number
This section shows ALL VISITS for each patient We can visualize the visit number (x axis), GA at visit (time), and type of org unit at each visit (y axis). The inspiration is this NYTimes infographic
Each dot represents a patient as she moves through each level of the health system. The patient’s dot “rests” at the location of the last visit recorded. Note that the this considers each identification, ANC visit, or home visit stage a separate “visit”.
You can see that very few patients get past the 3rd visit at any level. Most of the migration occurs from the FWA Unit level up the system to FWC. And compared to CC level, more clients who made it to FWC by the third visit started from another org unit (red dots).
cascade_data <-my_data %>%
ungroup() %>%
rename("event_ou_id"=ou_id, "event_ou_type"=ou_type) %>%
select(tei, event_ou_type, "event_ga"=ga, Moved_ou, StageUid) %>%
left_join(ps, by = c("StageUid"="id")) %>%
filter(!is.na(event_ou_type) & !is.na(event_ga) &
event_ga <= 50 & event_ga >= 1 &
str_detect(psname, paste(c("regnanc", "ANC"),collapse = '|')) &
!str_detect(psname, paste(c("Prev","Risk","Manag", "Out"),collapse = '|'))) %>%
group_by(tei) %>%
mutate("VisitNo"=if_else(str_detect(psname, "ident"), 1, 2)) %>%
arrange(VisitNo, event_ga) %>%
mutate("VisitNo"=row_number())
cascade_data<-cascade_data %>%
mutate("VisitNo"=if_else(VisitNo==1, 0, as.double(VisitNo))) %>%
bind_rows(cascade_data %>% filter(VisitNo == 1)) %>%
mutate("Moved_ou"=if_else(VisitNo <=1, "Event 1", Moved_ou)) %>%
mutate(event_ou_type=factor(event_ou_type, levels = c("Unit","Ward","FWC","CC"))) %>%
mutate(groupid= group_indices()) %>%
arrange(tei, VisitNo) %>%
mutate(gestage=if_else(VisitNo==0, 0, event_ga)) %>%
filter(VisitNo<8) %>%
mutate(gestage=if_else(gestage==lag(gestage) & groupid==lag(groupid) & gestage!=0,
gestage+2, gestage))
p3<-ggplot() +
geom_jitter(data = cascade_data, aes(x=VisitNo, y = event_ou_type,
group = groupid, col = Moved_ou),
size=0.5, width = 0.15, height=0.1) +
theme_minimal() +
transition_reveal(gestage) +
scale_x_continuous(breaks=c(0:7),
labels=c("GA 0", "Event1", "Event2",
"Event3","Event4","Event5", "Event6","Event7"),
minor_breaks = NULL) +
scale_colour_manual(values = c("grey", "red", "darkblue")) +
labs(title="e-Reg Matlab -- Events 1-7",
subtitle = 'Events at Gestational Age {round(frame_along)}',
x = "",
y="")+
theme(legend.position = "bottom")
animate(p3,
nframes = 200,
duration = 15,
end_pause = 50)

#gganimate::anim_save("bd_visit1to7.gif")
---
title: "Client Movement in e-Reg Matlab: Visual Analysis"
author: "Brian O'Donnell"
date: "5/8/2020"
output: 
  html_document:
    number_sections: true
    code_folding: hide
    code_download: yes
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: false
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(dplyr)
library(tidyverse)
library(readr)
library(ggplot2)
library(gganimate)
library(httr)
library(jsonlite)
library(assertthat)
library(kableExtra)
library(viridis)
password<-read_lines("passw.txt")

```

# Data Import and Processing

These analyses pull data from production and development DHIS2 environments to assess client movement.

Data look like this:

```{r prod data}

#read file from production and view

raw4<-suppressMessages(read_csv("BrianDs.csv"))

#View(raw4)
#colnames(raw4)

head(raw4)

```


Data from development environment

```{r dev data}
#now get background data from dev
#get Org Unit groups
baseurl<-"https://bd-eregistry.dhis2.org/dhis/"
username<-"ing_test"


#function for logging in
loginDHIS2<-function(baseurl,username,password) {
  url<-paste0(baseurl,"api/me")
  r<-GET(url,authenticate(username,password))
  warn_for_status(r, task="log in")
  if(r$status_code == 200L){return(TRUE)}
}


if(loginDHIS2(baseurl, username, password)==TRUE){
  print("successfully logged in")
}else{
  stop("could not log in! Please check url, username and password")
}

#groups
url<-paste0(baseurl, "api/organisationUnitGroups.json?paging=false&fields=id,name,organisationUnits")
ou_groups<-fromJSON(content(GET(url), "text"), flatten = TRUE) %>% 
  data.frame() %>% 
  select("name"=1,"id"=2,"members"=3) %>% 
  unnest_longer(members) %>% 
  flatten()
head(ou_groups)
  
#stages
url<-paste0(baseurl, "api/programStages.json?paging=false&fields=id,name")
ps<-fromJSON(content(GET(url), "text"), flatten = TRUE) %>% 
  data.frame() %>% 
  select("id"=2, "psname"=1)
head(ps)

#OU names
url<-paste0(baseurl, "api/organisationUnits.json?paging=false&fields=id,name")
ou_names<-fromJSON(content(GET(url), "text"), flatten = TRUE) %>% 
  data.frame() %>% 
  select("name"=1, "ou_id"=2) %>% 
  mutate("ou_type"=case_when(
    str_detect(name, " FWC") ~ "FWC",
    str_detect(name, "(?i)CC") ~ "CC",
    str_detect(name, "Unit") ~ "Unit",
    str_detect(name, "Ward") ~ "Ward"))

head(ou_names)
```

## Data Processing

We then merge the raw outputs of TEI events with org unit information

If arrange events by visit number, classify subsequent visits as being at same or different org unit than the initial visit.

```{r processing}

my_data<-raw4 %>% 
  left_join(ou_names, by=c("EvtOrgUnit"="ou_id")) %>% 
  select(tei=3, ga=8, "ou_id"=EvtOrgUnit, ou_type, name, VisitNo, EnrOrgUnit, StageUid) %>% 
  distinct() %>% #event must be unique visit, i.e. an ANC management and ANC stage same week would be merged
  mutate(ga=round(as.numeric(ga))) %>% 
  filter(!is.na(ou_type) & !is.na(ga) & ga <= 50 & ga >= 1) %>% 
  mutate(VisitNo=if_else(VisitNo > 6, "7+", as.character(VisitNo))) %>% 
  mutate(VisitNo=as.factor(VisitNo)) %>% 
  mutate(ou_type=recode_factor(ou_type, "FWC"="FWC", "CC"="CC", "Ward"="Ward", "Unit"="Unit" )) %>% 
  group_by(tei) %>% 
  add_tally() %>% 
  arrange(tei, ga) %>% 
  mutate(first = dplyr::first(ou_id)) %>%
  mutate(ga_initial = dplyr::first(ga)) %>% 
  mutate(last_ou = lag(ou_id)) %>% 
  mutate(Moved_ou = case_when(first == ou_id & is.na(last_ou)  ~ "Event 1",
                              first == ou_id & !is.na(last_ou) ~ "Event 2+, same ou as Event 1", 
                              first != ou_id ~ "Event 2+, different ou as Event 1")) %>% 
  mutate(Moved_ou_wrap = str_wrap(Moved_ou, width = 20))

#my_data

head(my_data)

```

# Visualization 

Let's start simple with a histogram. GA of each visit, by OU type.

```{r hist basic}
p<-ggplot(my_data, aes(ga))+
  geom_histogram(bins=25)+
  facet_wrap(~ou_type, ncol=1)+
  labs(title="Events by GA and OU type")
p

```


## Density Plot by Org Unit type and Gestational Age

Another way to show this histogram is a density dot plot. To make it interesting we can animate it,
to emphasize this progression of time. It looks a bit like a paint roller...

```{r density anim}

p<-ggplot(my_data, aes(ga, ou_type)) +
  geom_jitter(aes(group = ga, size = .3), height = 0.25, show.legend = FALSE) +
  labs(title="Pregnancy events in e-Reg Matlab by Org Unit",
       subtitle = 'Visits at Gestational Age {closest_state}',
       y = 'Org Unit Type') +
 # scale_colour_manual(values = col_scale) +
  transition_states(ga, transition_length = 3, state_length = 2) +
  shadow_mark(size = .5) +
  ease_aes('linear')

animate(
  plot = p, 
  nframes = 200,
  duration = 15,
  end_pause = 50
)



```

But this doesnt say much about patient movement. When do clients move to a different org unit clinic?

* The red dots are the GA and location (org unit) of first event (identification). 

* The blue dots are subsequent events that are at same location as first event. 

* The green dots are movement to a DIFFERENT location than the identification org unit.

```{r density with colors}
p<-ggplot(my_data, aes(ga, Moved_ou_wrap)) +
  geom_jitter(aes(group = ga, color = Moved_ou_wrap), size = 0.01) +
  facet_wrap(~ou_type, ncol = 1)+
    theme(legend.position = "none") +
    labs(title="Pregnancy events in e-Reg Matlab by Org Unit") +
    ylab("")

p

```


Same thing, but animated...

```{r animated paint roller}
p<-p +
  labs(title="Pregnancy events in e-Reg Matlab by Org Unit",
       subtitle = 'Visits at Gestational Age {closest_state}') +
  theme(legend.position = "none") +
  transition_states(ga, transition_length = 3, state_length = 2) +
  shadow_mark(size = .3) +
  ease_aes('linear')

animate(
  plot = p, 
  nframes = 200,
  duration = 15,
  end_pause = 50
)


```



Now we can see that most movement to new place happens after 36 weeks for FWA units and HA Wards (Home PPC follow up for those not identified by FWA). But women move to FWC between 14 and 35 weeks. Fewer women choose to move to CC, if they were identified elsewhere.


We want to narrow in on the patients who start at one org unit, and receive services at another. This approach tells us what kind of org unit they GO to, but not what kind of org unit they COME from.

## Chord diagram
[Chord diagram]( https://bost.ocks.org/mike/uberdata/) might be helpful too here - to do.



## Heat Map

If we narrow in on the patients who move to a new location (the green dots above), we can see the overlap of service provision between types of org unit.

Here is a table of events that are at a *different org unit* than the enrollment org unit, arranged by org unit type.

```{r pcords data}
pcords_data <-my_data %>% 
  ungroup() %>% 
  filter(Moved_ou=="Event 2+, different ou as Event 1") %>% 
  rename("event_ou_id"=ou_id, "event_ou_type"=ou_type) %>% 
  left_join(ou_names, by=c("EnrOrgUnit"="ou_id")) %>% 
  rename("enr_ou_type"=ou_type) %>%
  select(enr_ou_type, event_ou_type, "event_ga"=ga, "enr_ga"=ga_initial) 


test<-pcords_data %>%
  arrange(event_ou_type) %>% 
  mutate(event_ou_type=factor(event_ou_type, levels=c("CC","FWC","Unit","Ward"))) %>% 
  group_by(enr_ou_type, event_ou_type) %>% 
  select("enrollment OU"=enr_ou_type, "event OU"=event_ou_type) %>% 
  summarize(count=n())

kable(test) %>%
  kable_styling()


```


We can visualize this graph in a heatmap

```{r heatmap 1}
# Give extreme colors:
library(viridis)

ggplot(test, aes(`enrollment OU`, `event OU`, fill= count)) + 
  geom_tile() +
  scale_fill_viridis(discrete=FALSE) +
  theme_minimal()

```

We can facet these down by enrollment OU, then show the event GA for each subsequent event.

```{r heatmap 2}

heat2<-pcords_data %>%
    mutate(event_ou_type=factor(event_ou_type, levels=c("CC","FWC","Unit","Ward"))) %>% 
  mutate(gestage_event=case_when(
    event_ga >= 0 & event_ga < 18 ~ "0-17",
    event_ga >= 18 & event_ga < 24 ~ "18-23",
    event_ga >= 24 & event_ga < 29 ~ "24-29",
    event_ga >= 29 & event_ga < 34 ~ "29-33",
    event_ga >= 34 & event_ga < 40 ~ "34-39",
                     event_ga >= 40 ~ "40+",
  )) %>% 
  group_by(enr_ou_type, event_ou_type, gestage_event) %>% 
  select("enrollment OU"=enr_ou_type, "event OU"=event_ou_type, gestage_event) %>% 
  summarize(count=n())


ggplot(heat2, aes(gestage_event, `event OU`, fill= count)) + 
  geom_tile() +
  scale_fill_viridis(discrete=FALSE) +
  facet_wrap(~`enrollment OU`,labeller = "label_both")+
  labs(title="Events at different OU than enrollment",
       subtitle="By Event Gest Age and OU Type")

```


## Animated Dot Plot


What are the patterns over time though? 

In this animation, we focus on the "green dots" above. Each green dot is an event. The horizontal lines represent a type of org unit. The middle grey dots represent the identification events--once they cross that dot, the pregnancy is identified.

After that, clients go to many other types of org units. Some go to a different org unit of same type, while others go to a new org unit type.

By animating this over gestational age at visit, we can see which weeks had highest "crossover" of events. The speed of dot movement represents the time between visits. 

```{r animation}

pcord2<-pcords_data %>% 
rownames_to_column(var="id") %>% 
mutate("enr_start"=enr_ou_type) %>% 
pivot_longer(c('enr_start', 'enr_ou_type','event_ou_type'), 
             names_to = "start_finish", 
             values_to="ou_type") %>% 
  mutate(gestage=if_else(start_finish=="enr_ou_type", enr_ga, 
                          if_else(start_finish=="event_ou_type", event_ga, 0))) %>% 
  mutate(endpoint=if_else(start_finish=="enr_start", 0,
                          if_else(start_finish=="enr_ou_type", 1, 2))) %>% 
  mutate(ou_type=factor(ou_type, levels = c("Unit","Ward","FWC","CC"))) %>% 
  mutate(gestage=if_else(endpoint==2 & enr_ga > 30 & gestage > 30, gestage + 2, gestage)) %>% 
  mutate(gestage=if_else(endpoint==2, gestage + 1, gestage))


pcord_summ<-pcord2 %>% 
  filter(endpoint!=0) %>% 
  group_by(endpoint, ou_type, gestage) %>% 
  summarise("count"=n_distinct(id)) %>% 
  mutate("cumsum"=cumsum(count))

#pcord2 %>% filter(endpoint==2 & gestage > 35)


ps1<-pcord_summ %>% filter(endpoint==1)
ps2<-pcord_summ %>% filter(endpoint==2)

p2<-ggplot() +
  geom_point(data = pcord2, aes(x=endpoint, y = ou_type, 
                                group = id), col = "green")  +
  geom_point(data = ps1, aes(x=endpoint, y = ou_type, size = cumsum),
                                col="grey", alpha=0.8) +
  geom_point(data = ps2, aes(x=endpoint, y = ou_type, size = cumsum),
                               col="grey", alpha=0.8) +
  theme_minimal() + 
  transition_reveal(gestage) +
  scale_x_continuous(breaks=c(0, 1, 2),
                   labels=c("GA 0","Identification", "Other Event")) +
  labs(title="e-Reg Matlab -- Events at Different Org Unit than Identification",
       subtitle = 'Events at Gestational Age {round(frame_along)}',
       x = "")

animate(p2,
        nframes = 200,
        duration = 15,
        end_pause = 50)
```

If a dot moves quickly between identification and subsequent visit, that means that the next visit happened quickly after identification (e.g., Unit identifies pregnancy at 16 weeks, visit to CHCP at 18 weeks). Inversely, the second visit may occur long after identification (same client visits CHCP again at 32 weeks, this would be a slower moving dot).

Eventually want to recreate as a [Sankey flow diagram](https://www.hvitfeldt.me/blog/recreate-sankey-flow-chart/) across 4 ANC visits. See below.



## Parallel Coordinates

If we want to explore these relationships further, we can use interactive visualization.

The type below is called *parallel coordinates.* 

The vertical axes represent variables. The horizontal and diagonal lines are observations, where each is an EVENT that took at a different place than the enrollment org unit. The colors are based on the type of enrollment org unit.

You can click and drag across an axis to select a range for each variable, and it will filter down to the observations that meet that criteria. For example, of those clients who were enrolled at a WARD, and later went to a CC, what was the range of gestational ages when that CC visit took place?

```{r parcoords}
##### Parallel Coordinates Graph ######
library(parcoords)
#parallel coordinates with color based on gender

parcoords::parcoords(data = pcords_data,
                     rownames = TRUE,
                     color = list(
                       # discrete or categorical column
                       colorScale = "scaleOrdinal",
                       colorBy = "enr_ou_type",
                       colorScheme = "schemeCategory10"),
                     withD3 = TRUE,
                     brushMode = "1D-axes-multi",
                     alphaOnBrushed = 0.2,
                     queue = TRUE,
                     rate = 50,
                     reorderable = TRUE)


```

To make the correlations easier to read, you can drag and rearrange the axes order.

We might expand on this by linking to the selected ranges to tables, for dynamic filtering of data.



# Dropouts: Isolating Clients with only one event

If the client has only one event in system, maybe they are different for some reason than the other events.

For example, what stage was their only event?

What kind of org unit?

The below tables are only one event. 

First is by org unit -- most of these are the Pregnancy ID stage.

```{r kable}
library(kableExtra)
###Isolate those that only havd one event
my_data_iso<-raw4 %>% 
  left_join(ou_names, by=c("EvtOrgUnit"="ou_id")) %>% 
  select(tei=3, ga=8, "ou_id"=EvtOrgUnit, ou_type, name, VisitNo, EnrOrgUnit, StageUid) %>%
  left_join(ps, by = c("StageUid"="id")) %>% 
  mutate(ga=round(as.numeric(ga))) %>% 
  filter(!is.na(ou_type) & !is.na(ga) & 
           ga <= 50 & ga >= 1 &
          str_detect(psname, paste(c("regnanc", "ANC", "Newborn","PNC","Lab"),collapse = '|')) &
          !str_detect(psname, paste(c("Prev","Risk","Manag"),collapse = '|'))) %>% 
  group_by(tei) %>% 
  add_tally() %>% 
  arrange(tei, ga) %>% 
  mutate(first = dplyr::first(ou_id)) %>%
  mutate(last_ou = lag(ou_id)) %>%
  ungroup() %>% 
  mutate(Moved_ou = case_when(first == ou_id & is.na(last_ou)  ~ "Event 1",
                              first == ou_id & !is.na(last_ou) ~ "Event 2+, same ou as Event 1", 
                              first != ou_id ~ "Event 2+, different ou as Event 1"))

# my_data_iso %>% 
#   group_by(ou_type, n) %>% 
#   summarise("events"= n()) %>% 
#   mutate(percent = round(events/sum(events), 2))

#my_data_iso

test2<-my_data_iso %>% 
  group_by(tei) %>% 
  filter(n==1) %>% 
  group_by(psname) %>% 
  summarize("stage_count"=n())

test3<-my_data_iso %>% 
  filter(n==1) %>% 
  group_by(ou_type) %>% 
  summarize("ou_type_count"=n())

#kableExtra::kable(test2)
kable(test2) %>% 
  kable_styling()

```

Next is by org unit. Most of these are the FWA Units.

```{r}
kable(test3) %>% 
  kable_styling()



```

# Cascade by Visit Number

This section shows *ALL VISITS* for each patient We can visualize the visit number (x axis), GA at visit (time), and type of org unit at each visit (y axis). The inspiration is this [NYTimes infographic](https://www.nytimes.com/interactive/2018/03/19/upshot/race-class-white-and-black-men.html)

Each dot represents a patient as she moves through each level of the health system. The patient's dot "rests" at the location of the last visit recorded. Note that the this considers each identification, ANC visit, or home visit stage a separate "visit". 

You can see that very few patients get past the 3rd visit at any level. Most of the migration occurs from the FWA Unit level up the system to FWC. And compared to CC level, more clients who made it to FWC by the third visit started from another org unit (red dots).

```{r}

cascade_data <-my_data %>% 
  ungroup() %>% 
  rename("event_ou_id"=ou_id, "event_ou_type"=ou_type) %>% 
  select(tei, event_ou_type, "event_ga"=ga, Moved_ou, StageUid) %>% 
  left_join(ps, by = c("StageUid"="id")) %>% 
  filter(!is.na(event_ou_type) & !is.na(event_ga) & 
           event_ga <= 50 & event_ga >= 1 &
          str_detect(psname, paste(c("regnanc", "ANC"),collapse = '|')) &
          !str_detect(psname, paste(c("Prev","Risk","Manag", "Out"),collapse = '|'))) %>% 
  group_by(tei) %>% 
  mutate("VisitNo"=if_else(str_detect(psname, "ident"), 1, 2)) %>% 
  arrange(VisitNo, event_ga) %>% 
  mutate("VisitNo"=row_number()) 

cascade_data<-cascade_data %>% 
  mutate("VisitNo"=if_else(VisitNo==1, 0, as.double(VisitNo))) %>% 
  bind_rows(cascade_data %>%  filter(VisitNo == 1)) %>% 
  mutate("Moved_ou"=if_else(VisitNo <=1, "Event 1", Moved_ou)) %>% 
  mutate(event_ou_type=factor(event_ou_type, levels = c("Unit","Ward","FWC","CC"))) %>% 
  mutate(groupid= group_indices()) %>% 
  arrange(tei, VisitNo) %>% 
  mutate(gestage=if_else(VisitNo==0, 0, event_ga)) %>% 
  filter(VisitNo<8) %>% 
  mutate(gestage=if_else(gestage==lag(gestage) & groupid==lag(groupid) & gestage!=0, 
                         gestage+2, gestage))



p3<-ggplot() +
  geom_jitter(data = cascade_data, aes(x=VisitNo, y = event_ou_type, 
                                group = groupid, col = Moved_ou), 
                                size=0.5, width = 0.15, height=0.1)  +
  theme_minimal() + 
  transition_reveal(gestage) +
  scale_x_continuous(breaks=c(0:7),
                   labels=c("GA 0", "Event1", "Event2",
                            "Event3","Event4","Event5", "Event6","Event7"),
                   minor_breaks = NULL) +
  scale_colour_manual(values = c("grey", "red", "darkblue")) +
  labs(title="e-Reg Matlab -- Events 1-7",
       subtitle = 'Events at Gestational Age {round(frame_along)}',
       x = "",
       y="")+
  theme(legend.position = "bottom")


animate(p3,
        nframes = 200,
        duration = 15,
        end_pause = 50)

#gganimate::anim_save("bd_visit1to7.gif")



```







